File set-up

Set working directory to current directory

if (rstudioapi::isAvailable()) {
  setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
}

Load standard libraries and resolve conflicts

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.0      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(conflicted)
conflict_prefer("filter", "dplyr")
## [conflicted] Will prefer dplyr::filter over any other package
conflict_prefer("select", "dplyr")
## [conflicted] Will prefer dplyr::select over any other package
conflict_prefer("slice", "dplyr")
## [conflicted] Will prefer dplyr::slice over any other package
conflict_prefer("rename", "dplyr")
## [conflicted] Will prefer dplyr::rename over any other package
conflict_prefer('intersect', 'dplyr')
## [conflicted] Will prefer dplyr::intersect over any other package

Read data

all_circ = read_tsv("../data/Supplementary_Table_2_all_circRNAs.txt")
## Rows: 1137099 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (16): chr, strand, cell_line, tool, circ_id, circ_id_strand, count_group...
## dbl  (7): start, end, BSJ_count, n_detected, n_db, estim_len_in, BSJ_count_m...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
all_circ
cq = read_tsv('../data/Supplementary_Table_3_selected_circRNAs.txt')
## Rows: 1560 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (26): chr, strand, circ_id, circ_id_strand, tool, cell_line, FWD_primer,...
## dbl (21): start, end, FWD_pos, FWD_length, REV_pos, REV_length, FWD_TM, REV_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sens = read_tsv('../data/Supplementary_Table_6B_sensitivity_values.txt')
## Rows: 32 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): tool, count_group_median
## dbl (3): nr_detected, nr_expected, sensitivity
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

How many transcripts have circRNAs?

ENST

all_circ %>%
  filter(!is.na(ENST), !ENST == 'ambiguous') %>%
  select(ENST) %>% unique() %>%
  nrow()
## [1] 17461
# NA or ambiguous => strand needs to be taken into account
all_circ %>% select(circ_id_strand, ENST) %>% unique() %>%
  filter(is.na(ENST) | ENST == 'ambiguous') %>%
  count(ENST)

CircRNA length stats

All circRNAs

with introns

all_circ %>% select(chr, start, end, estim_len_in) %>% unique() %>%
  pull(estim_len_in) %>% quantile()
##     0%    25%    50%    75%   100% 
##     31    454   1660   9925 999911

no introns

all_circ %>% select(chr, start, end, estim_len_ex) %>% unique() %>%
  filter(!(is.na(estim_len_ex) | estim_len_ex == 'ambiguous')) %>%
  mutate(estim_len_ex = as.numeric(estim_len_ex)) %>%
  pull(estim_len_ex) %>% quantile()
##    0%   25%   50%   75%  100% 
##     1   286   498   917 37221

CircRNAs that don’t get validated with RNase R

do not get validated (no introns)

cq %>% filter(qPCR_val == 'pass',
              RR_val == 'fail',
              amp_seq_val == 'pass') %>%
  select(chr, start, end, estim_len_ex) %>% unique() %>%
  filter(!(is.na(estim_len_ex) | estim_len_ex == 'ambiguous')) %>%
  mutate(estim_len_ex = as.numeric(estim_len_ex)) %>%
  pull(estim_len_ex) %>% quantile()
##      0%     25%     50%     75%    100% 
##  115.00 2109.75 2381.50 3483.00 4758.00

BSJ count distribution

all_circ %>% 
  filter(!tool == "Sailfish-cir") %>%
  pull(BSJ_count) %>%
  quantile()
##   0%  25%  50%  75% 100% 
##    1    1    1    3 4007

how many with BSJ < 5

all_circ %>% 
  filter(!tool == "Sailfish-cir") %>%
  count(count_group)
round(945201/(945201+146700), 3)
## [1] 0.866

how many with BSJ >= 2

all_circ %>% 
  filter(!tool == "Sailfish-cir") %>%
  filter(BSJ_count >= 2) %>% nrow()
## [1] 503237
round(503237/(945201+146700), 3)
## [1] 0.461

How many circRNAs are detected on different strands

all_circ %>% 
  filter(!strand == 'unknown') %>%
  select(chr, start, end) %>% unique() %>%
  nrow()
## [1] 174009
all_circ %>% 
  filter(!strand == 'unknown') %>%
  select(chr, start, end, strand) %>% unique() %>%
  nrow()
## [1] 190314
round((190314-174009)/174009, 3)
## [1] 0.094

CircRNA numbers

nr of circ detected per tool

all_circ %>%
  filter(cell_line == 'HLF') %>%
  group_by(tool) %>%
  count() %>%
  arrange(n)

nr of unique circ

all_circ %>% select(circ_id) %>%
  unique()

total nr of circ

nrow(all_circ)
## [1] 1137099